---
title: "Replication data for Bureaucratic Responsiveness in Humanitarian Aid, Appendix A and B"
output: html_notebook
---

This is the R code used for the data analyses presented in Appendix A and Appendix B of the article "Bureaucratic Responsiveness in Humanitarian Aid".

The data here is used to replicate Tables A3 and A4, as well as all of the figures presented in Appendix B. 


First load the environment and the document term matrix containing the New York Times articles, "tm_col". 

```{r}
Sys.setenv(LANGUAGE = "en")


library(dplyr)
library(tidyverse)
library(quanteda)
library(quanteda.textstats)
library(ggplot2)
library(topicmodels)
library(ldatuning)
library(doParallel)
library(textmineR)
library(wesanderson)
library(tidytext)
library(gridExtra)
library(ape)
library(scales)


load(file = "tm_col_DTM.rds")

```



Use the ldatuning package to identify the optimal number of topics "k" for the topic model. 

* === Important note: This process will take a considerable amount of time. Total computational time exceeded 48 hours and had to be broken into multiple small chunks in order to prevent data storage problems. If you would like to skip this step you can find the loaded results in the next chunk. === *

```{r}
#note this will take several days of computing time on a standard PC or desktop computer. 
tunes <- FindTopicsNumber(
    tm_col,
    topics = c(1:10 * 10, 120, 140, 160, 180, 0:5 * 50 + 200),
    metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010"),
    method = "Gibbs",
    control = list(seed = 77),
    mc.cores = 4L,
    verbose = TRUE)
```

If skipping the previous step, load the completed results here. 

```{r}

load("tunes_topicnumbers.rds")

```

Plot the tuning results.

The results should match those presented in "Figure B1. A depiction of topic model tuning".

```{r}
FindTopicsNumber_plot(tunes)

```

Evaluate potential topic models using a 5-fold cross-validation of perplexity. 

* === Important Note: This step will take several hours to process. As with the previous step, you can skip ahead to the next chunk to view the completed analysis. === *

```{r}

#set possible values for k
candidate_k <- c(10, 20, 50, 100, 0:5 * 50 + 200)


cluster <- makeCluster(detectCores(logical = TRUE) - 1) # leave one CPU spare...
registerDoParallel(cluster)

clusterEvalQ(cluster, {
  library(topicmodels)
})

burnin = 1000
iter = 1000
keep = 50
n <- nrow(tm_col)
folds <- 5 
splitfolds <- sample(1:folds, n, replace = TRUE)
clusterExport(cluster, c("tm_col", "burnin", "iter", "keep", "splitfolds", "folds", "candidate_k"))

set.seed(1122)
system.time({
 val_results_3 <- foreach(j = 1:length(candidate_k), .combine = rbind) %dopar%{
    k <- candidate_k[j]
    results_1k <- matrix(0, nrow = folds, ncol = 2)
    colnames(results_1k) <- c("k", "perplexity")
    for(i in 1:folds){
      train_set <- tm_col[splitfolds != i , ]
      test_set <- tm_col[splitfolds == i, ]
      
      fitted <- LDA(train_set, k = k, method = "Gibbs",
                    control = list(burnin = burnin, iter = iter, keep = keep) )
      results_1k[i,] <- c(k, perplexity(fitted, newdata = test_set))
    }
    return(results_1k)
  }
})
stopCluster(cluster)
```

Load the completed perplexity results. 

```{r}
load(file = "perplexity_results.rds")
```

Plot the perplexity results. 

The results presented here should match those found in "Figure B2. A depiction of perplexity scores for topic models".  

```{r}

#plot perplexity results
ggplot(perp10_400, aes(x = k, y = perplexity)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  labs(x = "Number of topics", y = "Perplexity")
```

Now create the topic model using the optimal value, k=250. 

* === Important Note: This will take several hours. If you would like to skip ahead, you can load the completed topic model in the next section. === *

```{r}
require(topicmodels)
K <- 250
topicModel <- LDA(tm_col, K,
                  method = "Gibbs",
                  control=list(seed=1234), 
                  alpha = 0.1)
```

If skipping the manual calculation, load the completed topic model. 

```{r}
load(file = "LDAtopicmodel_k250.rds")
```


Identify topic clusters using Hellinger distance.

The branch clusters of the phylogenetic tree correspond to the topic clusters identified in Table A3 and Table A4. 

```{r}
topic_linguistic_dist <- CalcHellingerDist(topicModel@beta)
hclust <- hclust(as.dist(topic_linguistic_dist), "ward.D")

phylo <- as.phylo(hclust)
plot(as.phylo(hclust), type = "unrooted", cex = 0.5,
     lab4ut = "axial",
     no.margin = TRUE)

```


Create a data frame compiling topic assignments, cluster assignments, and top five terms. 

These results should correspond with those found in Table A3. Interpretation of topic clusters is human-assigned. 

```{r}

top_terms <- terms(topicModel, 5)  

top_terms_df <- data.frame(
  topic = seq_len(ncol(top_terms)),  # Ensures integer topic index (1, 2, 3, ...)
  top_terms = apply(top_terms, 2, function(x) paste(x, collapse = ", ")))


#define clusters using the phylogenetic branches
clusters <- list(
  c1 = as.integer(c("120", "218", "85", "98", "249", "115", "122", "143", "171", "69", "72", "4", "159", "103", "156", "8", "194", "76")),
  c2 = as.integer(c("56", "183", "27", "87", "139", "222", "5", "154", "33", "111", "181", "246", "16", "19", "74", "176", "34", "90")),
  c3 = as.integer(c("93", "211", "42", "133", "28", "31", "41", "96", "92", "12", "182")),
  c4 = as.integer(c("119", "242", "52", "26", "39", "64", "248", "79", "54", "77", "91", "179", "94", "124")),
  c5 = as.integer(c("202", "230", "175", "234", "190", "200", "62")),
  c6 = as.integer(c("86", "22", "58", "100", "2")),
  c7 = as.integer(c("197", "53", "206", "21", "99", "84")),
  c8 = as.integer(c("109", "157", "108", "239", "214", "164", "174", "195", "153")),
  c9 = as.integer(c("144", "102", "148", "65", "204", "59", "25", "228", "61", "49", "29", "127", "245", "128", "198", "106", "15", "235", "141", "35", "113", "110", "36", "24", "201", "238", "227", "117")),
  c10 = as.integer(c("130", "125", "226", "203", "78", "250", "215", "138", "67", "142", "97", "217", "172", "150", "208", "129", "212", "23", "229", "6", "225", "132")),
  c11 = as.integer(c("220", "145", "186", "160", "38", "178", "75", "131", "168", "83", "73", "30")),
  c12 = as.integer(c("196", "213", "68", "223", "184", "17", "210", "14", "192", "7", "244", "82", "240", "193", "152", "71", "9", "167", "219", "101", "80", "233", "224", "18", "123", "45", "20", "237", "216", "50", "13", "207", "89")),
  c13 = as.integer(c("158", "114", "51", "1", "170", "95", "63")),
  c14 = as.integer(c("205", "166", "11", "161", "81", "231", "149")),
  c15 = as.integer(c("10", "66", "57", "70", "116", "236", "37", "60", "40", "55", "32", "169", "126", "180", "47", "185", "191", "189", "107", "140", "121", "187")),
  c16 = as.integer(c("147", "177", "48", "232", "188", "155", "163", "137", "88", "105", "173", "146", "162", "46", "243", "199", "3", "44", "104", "209", "151", "43", "135", "221", "118", "136", "241", "112", "165", "134", "247"))
)

#create a data frame for clusters
clustered_topics <- do.call(rbind, lapply(seq_along(clusters), function(i) {
  data.frame(
    cluster = paste0("Cluster_", i),
    topic = clusters[[i]]
  )
}))

#merge with top terms using correct topic index
clustered_topics_with_terms <- merge(clustered_topics, top_terms_df, by = "topic", all.x = TRUE)

#sort
clustered_topics_with_terms <- clustered_topics_with_terms[order(clustered_topics_with_terms$cluster, clustered_topics_with_terms$topic), ]

# View result
print(clustered_topics_with_terms)
```

Calculate the average annual word counts for cases within the data set. 

These results should match those found in "Figure B3 - A depiction of the average annual word count per case".

```{r}

#load the data set
data <- read.csv(file = "maindata_fpa_replication.csv")

#relabel shortened entries
data <- data %>%
  mutate(case = recode(case,
                       "Afghan" = "Afghanistan",
                       "Gaza" = "Gaza/West Bank"))

df <- data %>%
  group_by(case) %>%
  mutate(avg_words = mean(med_words)) %>%
  ungroup() %>%
  select(case, avg_words) %>%
  pivot_longer(cols = -case, names_to = "variable", values_to = "value") %>%
  unique()

ggplot(df, aes(fill=case, x=reorder(case, value), y = value)) +
  geom_col() +
  theme(legend.position = "none") +
  theme(axis.text = element_text(angle=90)) + 
  scale_fill_manual(values = 
                      wes_palette(n = 53, "Chevalier1", type = "continuous")) +
  xlab("Country") +
  ylab("Average Annual NYT Word Count")


```

Calculate the average annual media diversity (entropy) per case. 

These results should match those found in "Figure B4 - A depiction of the average annual media diversity per case".

```{r}
df <- data %>%
  group_by(case) %>%
  mutate(avg_med_h = mean(med_h)) %>%
  ungroup() %>%
  select(case, avg_med_h) %>%
  gather(key = "variable", value = "value", -case) %>%
  unique()

ggplot(df, aes(fill=case, x=reorder(case, value), y = value)) +
  geom_col() +
  theme(legend.position = "none") +
  theme(axis.text = element_text(angle=90)) + 
  scale_fill_manual(values = 
                      wes_palette(n = 53, "Chevalier1", type = "continuous")) +
  xlab("Country") +
  ylab("Average Annual Media Entropy") + 
  coord_cartesian(ylim = c(0.5, 0.95))
```

Calculate media attention to particular topic clusters at differing levels of media volume and diversity. 

These results should match those found in "Figure B5 - A depiction of media attention to particular topic clusters after excluding regional and junk topics"

```{r}
#load original articles to extract document word counts
raw_articles <- read.csv(file = "raw_articles.csv", fileEncoding = "UTF-16LE")

raw_articles <- data.frame(raw_articles, stringsAsFactors = FALSE)

#convert to a quanteda corpus and extract docvars
corp <- corpus(raw_articles, text_field = "Article")

doc_info <- docvars(corp) %>%
  mutate(document = docid(corp))

#clean up the labels in doc_info
doc_info <- doc_info %>%
  mutate(filename = str_remove(basename(Source_File), "\\.docx$")) %>%
  mutate(
    year = str_extract(filename, "\\d{4}(?:_\\d+)?$"),               
    case = str_remove(filename, " \\d{4}(?:_\\d+)?$")                
  )

doc_info <- doc_info %>%
  mutate(case = case_when(
    case == "West Bank" ~ "Gaza",
    TRUE ~ case
  ))

#tidy the topic model
tidy_docs <- tidy(topicModel, matrix = "gamma")

#join to doc_info
tidy_docs <- tidy_docs %>%
  left_join(doc_info %>% select(case, year, document, Length), by = "document")

#use gamma values to calculate topic-focused coverage
tidy_docs <- tidy_docs %>%
  mutate(topic_words = gamma * Length)

topic_word_summary <- tidy_docs %>%
  group_by(case, year, topic) %>%
  summarise(total_topic_words = sum(topic_words, na.rm = TRUE),
            .groups = "drop")

#set the Hellinger distance-derived topic clusters
c1 <- as.integer(c("120", "218", "85", "98", "249", "115", "122", "143", "171", "69", "72", "4", "159", "103", "156", "8", "194", "76"))
c2 <- as.integer(c("56", "183", "27", "87", "139", "222", "5", "154", "33", "111", "181", "246", "16", "19", "74", "176", "34", "90"))
c3 <- as.integer(c("93", "211", "42", "133", "28", "31", "41", "96", "92", "12", "182"))
c4 <- as.integer(c("119", "242", "52", "26", "39", "64", "248", "79", "54", "77", "91", "179", "94", "124"))
c5 <- as.integer(c("202", "230", "175", "234", "190", "200", "62"))
c6 <- as.integer(c("86", "22", "58", "100", "2"))
c7 <- as.integer(c("197", "53", "206", "21", "99", "84"))
c8 <- as.integer(c("109", "157", "108", "239", "214", "164", "174", "195", "153"))
c9 <- as.integer(c("144", "102", "148", "65", "204", "59", "25", "228", "61", "49", "29", "127", "245", "128", "198", "106", "15", "235", "141", "35", "113", "110", "36", "24", "201", "238", "227", "117"))
c10 <- as.integer(c("130", "125", "226", "203", "78", "250", "215", "138", "67", "142", "97", "217", "172", "150", "208", "129", "212", "23", "229", "6", "225", "132"))
c11 <- as.integer(c("220", "145", "186", "160", "38", "178", "75", "131", "168", "83", "73", "30"))
c12 <- as.integer(c("196", "213", "68", "223", "184", "17", "210", "14", "192", "7", "244", "82", "240", "193", "152", "71", "9", "167", "219", "101", "80", "233", "224", "18", "123", "45", "20", "237", "216", "50", "13", "207", "89"))
c13 <- as.integer(c("158", "114", "51", "1", "170", "95", "63"))
c14 <- as.integer(c("205", "166", "11", "161", "81", "231", "149"))
c15 <- as.integer(c("10", "66", "57", "70", "116", "236", "37", "60", "40", "55", "32", "169", "126", "180", "47", "185", "191", "189", "107", "140", "121", "187"))
c16 <- as.integer(c("147", "177", "48", "232", "188", "155", "163", "137", "88", "105", "173", "146", "162", "46", "243", "199", "3", "44", "104", "209", "151", "43", "135", "221", "118", "136", "241", "112", "165", "134", "247"))

#combine into list
topic_clusters <- list(c1=c1, c2=c2, c3=c3, c4=c4, c5=c5, c6=c6, c7=c7, c8=c8,
                       c9=c9, c10=c10, c11=c11, c12=c12, c13=c13, c14=c14, c15=c15, c16=c16)

#convert to data frame
topic_cluster_df <- bind_rows(
  lapply(names(topic_clusters), function(cluster_name) {
    data.frame(topic = topic_clusters[[cluster_name]], cluster = cluster_name)
  })
)

#calculate volume of coverage by cluster
topic_word_summary <- topic_word_summary %>%
  mutate(topic = as.integer(topic)) %>% 
  left_join(topic_cluster_df, by = "topic")

cluster_word_summary <- topic_word_summary %>%
  group_by(case, year, cluster) %>%
  summarise(total_cluster_words = sum(total_topic_words, na.rm = TRUE),
            .groups = "drop")

#read in main data set and join
data <- read.csv(file = "maindata_fpa_replication.csv")

data <- data %>%
  mutate(case = case_when(
    case == "Afghan" ~ "Afghanistan",
    TRUE ~ case
  ))

data <- data %>%
  mutate(year = substr(as.character(year), 1, 4))

joined_data <- cluster_word_summary %>%
  right_join(data, by = c("case", "year"))

#drop cases receiving no media attention whatsoever
joined_data <- joined_data %>%
  filter(med_words != 0) %>%
  filter(med_words != 0.1)

#reorder by word counts and add quartile labels for visualization
joined_data$total_cluster_words[is.na(joined_data$total_cluster_words)] <- 0
joined_data$cluster <- reorder(joined_data$cluster, joined_data$total_cluster_words)

joined_data <- joined_data %>%
  mutate(
    med_words_quartile = case_when(
      med_words >= quantile(med_words, 0.75, na.rm = TRUE) ~ "Top 25%",
      med_words <= quantile(med_words, 0.25, na.rm = TRUE) ~ "Bottom 25%",
      TRUE ~ "Middle 50%"
    ),
    med_h_quartile = case_when(
      med_h >= quantile(med_h, 0.75, na.rm = TRUE) ~ "Top 25%",
      med_h <= quantile(med_h, 0.25, na.rm = TRUE) ~ "Bottom 25%",
      TRUE ~ "Middle 50%"
    )
  )

#create data frames for visualization
#drop junk topics for visualization
bottom_volume <- joined_data %>%
  filter(med_words_quartile == "Bottom 25%") %>%
  group_by(cluster) %>%
  summarize(c_words = sum(total_cluster_words, na.rm = TRUE), .groups = "drop") %>%
  mutate(
    total_words = sum(c_words),
    percent = c_words / total_words
  ) %>%
  filter(!cluster %in% c("c1", "c13", "c14", "c15", "c16"))

top_volume <- joined_data %>%
  filter(med_words_quartile == "Top 25%") %>%
  group_by(cluster) %>%
  summarize(c_words = sum(total_cluster_words, na.rm = TRUE), .groups = "drop") %>%
  mutate(
    total_words = sum(c_words),
    percent = c_words / total_words
  ) %>%
  filter(!cluster %in% c("c1", "c13", "c14", "c15", "c16"))

bottom_diversity <- joined_data %>%
  filter(med_h_quartile == "Bottom 25%") %>%
  group_by(cluster) %>%
  summarize(c_words = sum(total_cluster_words, na.rm = TRUE), .groups = "drop") %>%
  mutate(
    total_words = sum(c_words),
    percent = c_words / total_words
  ) %>%
  filter(!cluster %in% c("c1", "c13", "c14", "c15", "c16"))

top_diversity <- joined_data %>%
  filter(med_h_quartile == "Top 25%") %>%
  group_by(cluster) %>%
  summarize(c_words = sum(total_cluster_words, na.rm = TRUE), .groups = "drop") %>%
  mutate(
    total_words = sum(c_words),
    percent = c_words / total_words
  ) %>%
  filter(!cluster %in% c("c1", "c13", "c14", "c15", "c16"))

#determine the max percent value across all datasets
max_percent <- max(
  bottom_volume$percent,
  top_volume$percent,
  bottom_diversity$percent,
  top_diversity$percent
)

#create bar graphs
bar1 <- ggplot(bottom_volume, aes(x = cluster, y = percent, fill = cluster)) +
  geom_col(width = 1, color = "black") +
  theme(
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    axis.text = element_text(size = 15),
    axis.text.y = element_text(angle = 90),
    legend.position = "none",
    panel.background = element_rect(fill = "white")
  ) +
  scale_fill_manual(
    values = wes_palette(n = length(unique(bottom_volume$cluster)), name = "Chevalier1", type = "continuous")
  ) +
  scale_y_continuous(
    labels = percent_format(accuracy = 1),
    limits = c(0, max_percent)
  )

bar2 <- ggplot(top_volume, aes(x = cluster, y = percent, fill = cluster)) +
  geom_col(width = 1, color = "black") +
  theme(
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    axis.text = element_text(size = 15),
    axis.text.y = element_blank(),
    legend.position = "none",
    panel.background = element_rect(fill = "white")
  ) +
  scale_fill_manual(
    values = wes_palette(n = length(unique(top_volume$cluster)), name = "Chevalier1", type = "continuous")
  ) +
  scale_y_continuous(
    labels = percent_format(accuracy = 1),
    limits = c(0, max_percent)
  )

bar3 <- ggplot(bottom_diversity, aes(x = cluster, y = percent, fill = cluster)) +
  geom_col(width = 1, color = "black") +
  theme(
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    axis.text = element_text(size = 15),
    axis.text.y = element_text(angle = 90),
    legend.position = "none",
    panel.background = element_rect(fill = "white")
  ) +
  scale_fill_manual(
    values = wes_palette(n = length(unique(bottom_diversity$cluster)), name = "Chevalier1", type = "continuous")
  ) +
  scale_y_continuous(
    labels = percent_format(accuracy = 1),
    limits = c(0, max_percent)
  )

bar4 <- ggplot(top_diversity, aes(x = cluster, y = percent, fill = cluster)) +
  geom_col(width = 1, color = "black") +
  theme(
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    axis.text = element_text(size = 15),
    axis.text.y = element_blank(),
    legend.position = "none",
    panel.background = element_rect(fill = "white")
  ) +
  scale_fill_manual(
    values = wes_palette(n = length(unique(top_diversity$cluster)), name = "Chevalier1", type = "continuous")
  ) +
  scale_y_continuous(
    labels = percent_format(accuracy = 1),
    limits = c(0, max_percent)
  )

# === Figure B4 === #
grid.arrange(bar1, bar2, bar3, bar4)



```

